ManyFaces Pilot Ratings Data Prep

Code
library(tidyverse)

Project Structure

Read in the full project structure from the project file to map experiment names to numbers.

Code
proj <- jsonlite::read_json("data/project_1136_structure.json")

exp_data <- purrr::map_df(proj, \(comp) {
  if (comp$component_type == "exp") {
     data.frame(
       exp_id = comp$id,
       name = comp$name,
       res_name = comp$res_name,
       instructions = comp$instructions,
       question = comp$question,
       exptype = comp$exptype,
       trial_order = comp$trial_order,
       total_stim = comp$total_stim,
       random_stim = comp$random_stim,
       trials = length(comp$trial),
       stim = length(comp$stim)
     )
  } else {
    NULL
  }
}) |>
  mutate(exp = sub("ManyFaces? Pilot Ratings: ", "", res_name) |> trimws())


trial_data <- purrr::map_df(proj, \(comp) {
  if (comp$component_type == "exp") {
    purrr::map_df(comp$trial, \(trial) {
      data.frame(
        exp_id = comp$id,
        n = trial$trial_n,
        name = trial$name, 
        img_id = trial$center_img,
        img_path = comp$stimuli[[as.character(trial$center_img)]]
      )
    })
  } else {
    NULL
  }
}) |>
  mutate(name = sub("^(manyfaces|attention_checks)/", "", name))
Code
exp_data |>
  select(exp_id, exp, question, trials) |>
  arrange(exp)

Data Prep

Raw Data

This workflow requires the data-raw directory, which is not shared on github.

This is the SQL for downloading the data from Experimentum. We need to download in chunks of 50000 rows to avoid file download limits on the site (not needed if downloading directly from SQL).

SELECT 
  session.id as session_id, project_id, exp.res_name as exp_name, exp_id, 
  session.user_id, user.sex as user_sex, user.status as user_status,
  ROUND(DATEDIFF(ed.dt, REPLACE(birthday, "-00","-01"))/365.25, 1) AS user_age,
  trial.name as trial_name,
  trial_n,
  `order`,
  dv,
  rt,
  ed.side,
  ed.dt
FROM session 
  LEFT JOIN user USING (user_id)
  LEFT JOIN exp_data AS ed ON ed.session_id = session.id
  LEFT JOIN exp ON exp.id = ed.exp_id
  LEFT JOIN trial USING (exp_id, trial_n)
WHERE session.project_id = 1136
  AND user.status IN ("guest", "registered")
  AND exp_id IN (1384, 1400, 1399, 1398, 1401, 1402, 1403, 
                 1404, 1405, 1397, 1390, 1389, 1388, 1387, 
                 1386, 1385, 1382, 1381, 1380, 1379, 1377)
LIMIT 50000
OFFSET 0
Code
# combine multiple downloads into one file
exp_raw <- list.files("data-raw/exp", full.names = TRUE) |> 
  read_csv(show_col_types = FALSE) |>
  unique() |>
  filter(user_status %in% c("guest", "registered"))

write_csv(exp_raw, paste0("data-raw/ManyFaces-Pilot-Ratings-exps_", Sys.Date(), ".csv"))

# get most recent files
exp_file <- list.files("data-raw", "ManyFaces-Pilot-Ratings-exps",
                       full.names = TRUE) |>
  sort(decreasing = TRUE) |>
  pluck(1)

exp_raw <- read_csv(exp_file, show_col_types = FALSE) |>
  filter(user_status %in% c("guest", "registered")) |>
  unique()

# get most recent files
quest_file <- list.files("data-raw", "ManyFaces-Pilot-Ratings-quests", 
                       full.names = TRUE) |>
  sort(decreasing = TRUE) |>
  pluck(1)

quest_raw <- read_csv(quest_file, show_col_types = FALSE) |>
  filter(user_status %in% c("guest", "registered")) |>
  unique()

# write to the data directory
write_csv(exp_raw, "data/manyfaces-pilot-exp.csv")
write_csv(quest_raw, "data/manyfaces-pilot-quest.csv")

Reshape Data

Code
exp_raw <- read_csv("data/manyfaces-pilot-exp.csv", show_col_types = FALSE)
quest_raw <- read_csv("data/manyfaces-pilot-quest.csv", show_col_types = FALSE)
Code
ed <- exp_data |>
  select(exp_id, exp, trials) |> 
  rename(trials_total = trials)

exp_long_raw <- exp_raw |>
  select(session_id, exp_id, trial_name, dv, rt, dt) |>
  unique() |>
  mutate(trial_name = sub("^(manyfaces|attention_checks)/", "", trial_name)) |>
  left_join(ed, by = "exp_id")

Preliminary data cleaning

Remove remove duplicate trials and incomplete data

We start with 2115 participants completing a total of 2158 experiments.

Some participants completed more than the maximum amount of trials in an experiment. Duplicate trials were removed by only retaining participants’ initial ratings of a particular stimulus.

Code
# NOTE: this is not elegant as it takes QUITE long to run. I initially just did the slicing for the subsection of IDs that I knew contained duplicates, but I felt this made code unnecessarily hard to follow. Happy to change back if running time is a concern!
dupl_trials_rm <- exp_long_raw |> 
  group_by(session_id, exp_id, trial_name, trials_total) |> 
  slice_min(dt, n = 1) |> 
  ungroup()
Code
incomplete <- dupl_trials_rm |> 
  count(session_id, exp_id, trials_total) |> 
  filter(n < trials_total)

complete <- dupl_trials_rm |> 
  filter(!session_id %in% incomplete$session_id)

190 participants did not complete all trials of an experiment and were excluded.

Some participants also completed more than one experiment. Data participants provided in a second rating study was removed.

Code
dupl_exp_ids <- complete |> 
  # Mark participants that participated twice and establish which exp they did first
  group_by(session_id, exp_id) |> 
  summarise(first_dt = min(dt), .groups = "drop") |>
  group_by(session_id) |> 
  mutate(n_exp = n()) |>
  mutate(first_exp = exp_id[which.min(first_dt)]) |>
  ungroup() |> 
  filter(n_exp > 1 & exp_id != first_exp)
Code
exp_long <- complete |> 
   anti_join(dupl_exp_ids, by = c("session_id", "exp_id" = "first_exp"))

Prior to pre-registered exclusions, preliminary sample thus consisted of 1936 participants.

Pre-registered exclusions

Overly consistent responses

Participants were excluded based on overly consistent responding, i.e. if they responded to at least 90% of trials identically.

Code
overly_consistent <- exp_long |> 
  summarise(
    same_pcnt = max(tabulate(match(dv, unique(dv)))) / n(),
    .by = c(session_id, exp_id)) |> 
  dplyr::select(session_id, exp_id, same_pcnt) |> 
  filter(same_pcnt >= 0.90)

Overly fast responses

Participants were excluded based on overly fast responding, i.e. if their median reaction time fell below the 1st percentile of the overall distribution of median reaction times.

Code
med_rt <- exp_long |>
  summarise(med_rt = median(rt),
            .by = c(session_id, exp_id))

overly_fast <- med_rt |> 
  filter(med_rt < quantile(med_rt, probs = 0.01))
Code
med_rt |> 
  ggplot(aes(x = med_rt)) +
  geom_histogram(bins = 100) +
  geom_vline(xintercept = quantile(med_rt$med_rt, probs = 0.01), 
             color = "red", linetype = "dashed", linewidth = 1) +
  scale_x_continuous(breaks = seq(0, 10000, 500)) +
  labs(x = "Median Reaction Time (ms)") +
  theme_bw()

Self-reported honesty check

Participants were excluded based on self-reported honesty check, i.e. if they responded not taking the study seriously vs. taking it authentically

Code
honesty_check_failed <- quest_raw |>
  filter(q_name == "try") |>
  select(session_id, honesty_check = dv) |>
  unique() |> 
  filter(honesty_check != 2) |> 
  mutate(exp_id = NA) |>
  relocate(exp_id, .after = session_id)

Attention checks

Participants were excluded if they failed two or more attention checks

Code
attn_checks_failed <- exp_long |>
  select(session_id:dv) |>
  filter(grepl("check", trial_name)) |>
  mutate(check_type = sub("check_[a-z0-9-]+_", "", trial_name),
         check_type = ifelse(exp_id == 1400, substr(check_type, 4, 6), check_type)) |>
  summarise(attn_checks_passed = mean(check_type == dv), 
            .by = c("session_id", "exp_id")) |> 
  filter(attn_checks_passed < 5/7)

Overview of exclusions

Code
exclusions <- overly_consistent |> 
  full_join(overly_fast, by = c("session_id", "exp_id")) |> 
  full_join(honesty_check_failed, by = c("session_id", "exp_id")) |> 
  full_join(attn_checks_failed, by = c("session_id", "exp_id"))
Code
exclusions |> 
  summarise(
    `>90% same response` = sum(!is.na(same_pcnt)),
    `RT below cutoff` = sum(!is.na(med_rt)),
    `Failed honesty check` = sum(!is.na(honesty_check)),
    `Failed 2 or more of 7 attention checks` = sum(!is.na(attn_checks_passed)),
    "Total exclusions" = n()
  ) |> 
  pivot_longer(
    cols = everything(),
    names_to = "Reason for exclusion",
    values_to = "Number of participants"
  )

Exclude 29 participants and remove attention checks from data.

Code
exp <- exp_long |> 
  anti_join(exclusions, by = c("session_id", "exp_id")) |>
  filter(!grepl("check_", trial_name))

After exclusions, there were 1909 participants.

Number of participants per study:

Code
exp |>
  summarise(.by = c(exp, session_id)) |>
  count(exp)
Code
endtimes <- quest_raw |>
  summarise(end = max(endtime), .by = c(session_id))

times <- exp |>
  summarise(start = min(dt), .by = c(session_id)) |>
  left_join(endtimes, by = "session_id") |>
  mutate(duration = interval(start, end) |> as.numeric("minutes"))

Demographics

Code
quest <- quest_raw |> 
  anti_join(exclusions, by = c("session_id")) |>
  select(session_id, q_name, dv, endtime) |>
  unique() |>
  pivot_wider(names_from = q_name, values_from = dv) |>
  mutate(age = as.integer(age))

Age and Gender

Code
ggplot(quest, aes(x = age, fill = gender)) +
  geom_histogram(binwidth = 1) +
  scale_fill_manual(values = c("hotpink", "lightblue", "orchid", "gray"))

Residence

Code
count(quest, residence, sort = TRUE)

Ethnicity

Code
quest |>
  mutate(ethnicity = tolower(ethnicity)) |>
  count(ethnicity, sort = TRUE)

Devices

Code
count(quest, device, sort = TRUE)

Plots

Code
rainbow <- c("firebrick", "darkorange", "goldenrod", "darkgreen", "dodgerblue3", "darkorchid4")

Standardised Neutral Ratings

Code
exp_levels <- c("attractive", "trustworthy", "dominant", 
                "memorable", "gender-typical")

exp |>
  filter(exp_id %in% 1377:1382) |>
  mutate(dv = as.integer(dv),
         exp = factor(exp, exp_levels)) |>
  ggplot(aes(x = dv, fill = exp)) +
  geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
  facet_wrap(~exp, ncol = 3, axes = "all_x", drop = FALSE) +
  labs(title = "Standardised Neutral Ratings",
       x = "") +
  scale_x_continuous(breaks = 1:7) +
  scale_fill_manual(values = rainbow, drop = FALSE)

Code
# function to create heatmap visualisations
heatmap <- function(id, label) {
  exp |>
    filter(exp %in% id) |>
    separate(trial_name, c("lab", "id"), extra = "drop") |>
    count(lab, id, dv) |>
    ggplot(aes(x = dv, y = id, fill = n)) +
    geom_tile() +
    facet_wrap(~lab) +
    scale_fill_viridis_c() +
    labs(x = label, y = NULL, 
         title = paste(label, "Ratings")) +
    theme(legend.position = "none", 
          axis.text.x = element_text(angle = 90))
}

Attractiveness

Code
heatmap("attractive", "Attractiveness")

Trustworthiness

Code
heatmap("trustworthy", "Trustworthiness")

Dominance

Code
heatmap("dominant", "Dominance")

Memorableness

Code
heatmap("memorable", "Memorableness")

Gender Typicality

Code
heatmap("gender-typical", "Gender Typicality")

Unstandardised Neutral Ratings

Code
exp_labels <- c("attractive", "trustworthy", "dominant")
exp_levels <- paste(exp_labels, "(unstd)")

exp |>
  filter(exp_id %in% 1397:1399) |>
  mutate(dv = as.integer(dv), 
         exp = factor(exp, exp_levels, exp_labels)) |>
  ggplot(aes(x = dv, fill = exp)) +
  geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
  facet_wrap(~exp, ncol = 5, drop = FALSE) +
  labs(title = "Unstandardised Neutral Ratings",
       x = "") +
  scale_x_continuous(breaks = 1:7) +
  scale_fill_manual(values = rainbow, drop = FALSE)

Attractiveness (Unstandardised)

Code
heatmap("attractive (unstd)", "Attractiveness (Unstandardised)")

Trustworthiness (Unstandardised)

Code
heatmap("trustworthy (unstd)", "Trustworthiness (Unstandardised)")

Dominance (Unstandardised)

Code
heatmap("dominant (unstd)", "Dominance (Unstandardised)")

Emotion Ratings

Code
dv_levels <- c("anger", "disgust", "fear", 
                "happiness", "sadness", "surprise", "other")
emo_levels <- c("ang", "dis", "fea", "hap", "sad", "sur")
emo_labels <- paste(dv_levels[1:6], "faces")

exp |>
  filter(exp_id %in% c(1384, 1401:1405)) |>
  separate(trial_name, c("lab", "model", "type", "emo", "view")) |>
  mutate(dv = factor(dv, dv_levels),
         emo = factor(emo, emo_levels, emo_labels)) |>
  ggplot(aes(x = dv, fill = dv)) +
  geom_point(aes(x = x, colour = I(fill), fill = I(fill)), 
             data.frame(emo = factor(emo_levels, emo_levels, emo_labels),
                        x = 1:6,
                        fill = rainbow),
             size = 6.5, y = -60, shape = 18, show.legend = FALSE) +
  geom_bar(color = "transparent") +
  facet_wrap(~emo, axes = "all_x", drop = FALSE) +
  scale_x_discrete(labels = c("A", "D", "F", "H", "S", "U", "O")) +
  scale_fill_manual(values = c(rainbow, "grey"), drop = FALSE) +
  labs(title = "Emotion Ratings",
       x = "",
       fill = "Rated Emotion") +
  coord_cartesian(clip="off") +
  theme(axis.ticks.x = element_blank())

Emotion Intensity Ratings

Code
exp_levels <- c("anger", "disgust", "fear", 
                "happiness", "sadness", "surprise")
               
exp |>
  filter(exp_id %in% 1385:1390) |>
  mutate(dv = as.integer(dv),
         exp = factor(exp, exp_levels)) |>
  ggplot(aes(x = dv, fill = exp)) +
  geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
  facet_wrap(~exp, ncol = 3, axes = "all_x", drop = FALSE) +
  labs(title = "Emotion Intensity Ratings",
       x = "") +
  scale_fill_manual(values = rainbow, drop = FALSE) +
  scale_x_continuous(breaks = 1:7)

Age Ratings

Code
dv_levels <- seq(20, 85, 5)
dv_labels <- paste(dv_levels-4, "-", dv_levels )
dv_labels[14] <- "81+"

exp |>
  filter(exp_id %in% 1400) |>
  mutate(dv = factor(dv, dv_levels, dv_labels)) |>
  ggplot(aes(x = dv)) +
  geom_bar(color = "black", fill = "white") +
  scale_x_discrete(drop = FALSE) +
  labs(title = "Age Ratings",
       x = "")

Code
# exp |>
#   filter(exp_id %in% 1400) |>
#   mutate(dv = as.numeric(dv) - 2.5) |>
#   summarise(age = mean(dv), age_sd = sd(dv), .by = trial_name)
Code
exp |>
  filter(exp_id %in% 1400) |>
  mutate(dv = factor(dv, dv_levels, dv_labels)) |>
  mutate(trial_name = gsub("_std_neu_0", "", trial_name)) |>
  separate(trial_name, c("lab", "id")) |>
  count(lab, id, dv) |>
  ggplot(aes(x = dv, y = id, fill = n)) +
  geom_tile() +
  facet_wrap(~lab) +
  scale_fill_viridis_c() +
  labs(x = "Age", y = NULL) +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90))